library(RColorBrewer)
library(ISLR)
library(MASS)
library(rpart)
library(rpart.plot)
library(ggplot2)
theme_set(theme_bw(base_size = 14))
cols <- brewer.pal(9, "Set1")
# Load the data
data("Boston")
# Split the data in training and test
n <- nrow(Boston)
n_train <- floor(0.8 * n)
n_test <- n - n_train
set.seed(123)
idx_test <- sample(1:n, n_test, replace = FALSE)
Boston_ts <- Boston[idx_test,]
Boston <- Boston[-idx_test,]
# Create grid where we make prediction
xgrid <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 1000)
# Fit a deep tree
deep_tree <- rpart(medv ~ lstat, method = "anova", data = Boston,
                   control = rpart.control(minsplit = 5, cp = .0005))
cat('Size of deep tree:', length(unique(deep_tree$where)), '\n')
Size of deep tree: 78
# Plot the cv error curve for the tree
par(mar=c(4,4,4,2), family = 'serif')
plotcp(deep_tree)
# Pick the best Cp
best_alpha <- deep_tree$cptable[which.min(deep_tree$cptable[,"xerror"]),"CP"]
cat('Best alpha:', best_alpha, '\n')
Best alpha: 0.00607656
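# An alternative (a minimal sketch): the 1-SE rule picks the simplest tree whose
# CV error is within one standard error of the minimum, using the same cptable
cptab <- deep_tree$cptable
min_row <- which.min(cptab[, "xerror"])
thr <- cptab[min_row, "xerror"] + cptab[min_row, "xstd"]
alpha_1se <- cptab[which(cptab[, "xerror"] <= thr)[1], "CP"]
cat('1-SE alpha:', alpha_1se, '\n')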
# Prune the tree down to that Cp
best_tree <- prune(deep_tree, cp = best_alpha)
cat('Size of best tree:', length(unique(best_tree$where)), '\n')
Size of best tree: 7
# Show the optimal tree
rpart.plot(best_tree, clip.right.labs = FALSE, under = TRUE, digits = 4)
best_tree
n= 404
node), split, n, deviance, yval
* denotes terminal node
1) root 404 35629.5400 22.72426
2) lstat>=5.44 340 14113.0300 19.93265
4) lstat>=14.395 145 3091.8700 15.13103
8) lstat>=19.83 64 1039.8170 12.45625 *
9) lstat< 19.83 81 1232.3800 17.24444 *
5) lstat< 14.395 195 5192.2580 23.50308
10) lstat>=9.725 90 658.9840 21.04667 *
11) lstat< 9.725 105 3524.7420 25.60857 *
3) lstat< 5.44 64 4790.5990 37.55469
6) lstat>=4.475 30 1309.4700 32.49667 *
7) lstat< 4.475 34 2036.4090 42.01765
14) lstat>=3.425 18 1207.3360 39.17222 *
15) lstat< 3.425 16 519.3844 45.21875 *
pred <- as.numeric(predict(best_tree, newdata = list(lstat = xgrid)))
par(mar=c(4,4,2,2), family = 'serif')
plot(Boston$lstat, Boston$medv, pch = 16, xlab = 'lstat', ylab = 'medv')
lines(xgrid, pred, lwd = 2, col = cols[1])
# Show suboptimal trees (different values of alpha)
alpha_vec <- c(0.1, best_alpha, 3E-3)
par(mfrow=c(3,2), mar = c(2,2,2,2), family = 'serif')
for(i in 1:3){
  plot(Boston$lstat, Boston$medv, pch = 16, cex = .5,
       xlab = 'lstat', ylab = 'medv')
  ptree <- prune(deep_tree, cp = alpha_vec[i])
  pred <- as.numeric(predict(ptree, newdata = list(lstat = xgrid)))
  lines(xgrid, pred, col = 'red', lwd = 2)
  title(paste('alpha =', round(alpha_vec[i], 4)))
  rpart.plot(ptree, clip.right.labs = FALSE, under = TRUE, digits = 4)
}
# Prediction on the test set
pred_y <- as.numeric(predict(best_tree, newdata = Boston_ts))
err_1D <- (pred_y - Boston_ts$medv)^2
plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')
n_lev <- 1000
cols_cont <- colorRampPalette(cols[c(2,3,6,1)])(n_lev)
levels <- seq(min(Boston$medv), max(Boston$medv), length.out = n_lev)
par(mar=c(4,4,2,2), family = 'serif')
plot(Boston$lstat, Boston$dis, pch = 16, xlab = 'lstat', ylab = 'dis',
     col = cols_cont[cut(Boston$medv, breaks = levels, include.lowest = TRUE)])
best_tree <- rpart(medv ~ lstat + dis, method = "anova", data = Boston)
cat('Size of optimal tree:', length(unique(best_tree$where)), '\n')
Size of optimal tree: 8
# Plot the cv error curve for the tree
par(mar=c(4,4,4,2), family = 'serif')
plotcp(best_tree)
# Show the optimal tree
rpart.plot(best_tree, clip.right.labs = FALSE, under = TRUE, digits = 4)
# Make perspective plot
x1_grid <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 100)
x2_grid <- seq(min(Boston$dis), max(Boston$dis), length.out = 100)
X_pred <- expand.grid(x1_grid, x2_grid)
X_pred <- data.frame(lstat = X_pred[,1], dis = X_pred[,2])
pred <- predict(best_tree, newdata = X_pred)
par(mar=c(1,1,1,1), family = 'serif')
persp(x1_grid, x2_grid, matrix(pred, nrow = length(x1_grid)),
      theta = 150, xlab = 'lstat', ylab = 'dis', zlab = 'medv',
      zlim = range(Boston$medv), lwd = 0.2)
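# A complementary view (sketch): a top-down image of the same fitted surface,
# which makes the axis-aligned rectangles of the partition visible
par(mar = c(4, 4, 2, 2), family = 'serif')
image(x1_grid, x2_grid, matrix(pred, nrow = length(x1_grid)),
      col = cols_cont, xlab = 'lstat', ylab = 'dis')
points(Boston$lstat, Boston$dis, pch = 16, cex = .4)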
# Prediction on the test set
pred_y <- as.numeric(predict(best_tree, newdata = Boston_ts))
err_2D <- (pred_y - Boston_ts$medv)^2
plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')
sqrt(mean(err_1D)); sqrt(mean(err_2D))
[1] 6.022948
[1] 5.147602
best_tree <- rpart(medv ~ ., method = "anova", data = Boston)
cat('Size of big tree:', length(unique(best_tree$where)), '\n')
Size of big tree: 8
# Plot the cv error curve for the tree
par(mar=c(4,4,4,2), family = 'serif')
plotcp(best_tree)
# Show the optimal tree
rpart.plot(best_tree, clip.right.labs = FALSE, under = TRUE, digits = 4)
# Prediction on the test set
pred_y <- as.numeric(predict(best_tree, newdata = Boston_ts))
err_all <- (pred_y - Boston_ts$medv)^2
plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')
sqrt(mean(err_1D)); sqrt(mean(err_2D)); sqrt(mean(err_all))
[1] 6.022948
[1] 5.147602
[1] 4.743209
# Let's first try to show how Bagging works
xgrid <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 1000)
B <- 1000
pred_y <- array(NA, dim = c(B, length(xgrid)))
for (b in 1:B){
  Boston_b <- Boston[sample(1:n_train, n_train, replace = TRUE), ]
  deep_tree <- rpart(medv ~ lstat, method = "anova", data = Boston_b,
                     control = rpart.control(minsplit = 5, cp = .0025))
  pred_y[b,] <- as.numeric(predict(deep_tree, newdata = list(lstat = xgrid)))
}
plot(Boston$lstat, Boston$medv, pch = 16, xlab = 'lstat', ylab = 'medv')
lines(xgrid, pred_y[1,], lwd = 2, col = 'red')
lines(xgrid, pred_y[2,], lwd = 2, col = 'blue')
lines(xgrid, pred_y[3,], lwd = 2, col = 'green')
plot(Boston$lstat, Boston$medv, pch = 16, xlab = 'lstat', ylab = 'medv')
lines(xgrid, colMeans(pred_y), lwd = 2, col = 'red')
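# Sketch: the same bagging idea applied to the test set, averaging the B
# bootstrap-tree predictions (trees are refit here, so results vary with the draws)
pred_ts <- array(NA, dim = c(B, nrow(Boston_ts)))
for (b in 1:B){
  Boston_b <- Boston[sample(1:n_train, n_train, replace = TRUE), ]
  tree_b <- rpart(medv ~ lstat, method = "anova", data = Boston_b,
                  control = rpart.control(minsplit = 5, cp = .0025))
  pred_ts[b, ] <- predict(tree_b, newdata = Boston_ts)
}
err_bag <- (colMeans(pred_ts) - Boston_ts$medv)^2
sqrt(mean(err_bag))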
# Let's now use the randomForest package for random forests
library(randomForest)
rffit <- randomForest(medv ~ ., data = Boston, ntree = 200)
plot(rffit, main = 'OOB Error')
# Maybe we can achieve better OOB error?
rffit <- randomForest(medv ~ ., data = Boston, ntree = 1000)
plot(rffit, main = 'OOB Error')
pred_y <- predict(rffit, newdata = Boston_ts)
plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')
err_RF <- (pred_y - Boston_ts$medv)^2
sqrt(mean(err_1D)); sqrt(mean(err_2D)); sqrt(mean(err_all)); sqrt(mean(err_RF))
[1] 6.022948
[1] 5.147602
[1] 4.743209
[1] 3.482059
# We can change the number of variables to be considered at each split
p <- ncol(Boston) - 1
rffit1 <- randomForest(medv ~ ., data = Boston, ntree = 500, mtry = p)
plot(rffit1, main = 'OOB Error', ylim = c(10, 25))
rffit2 <- randomForest(medv ~ ., data = Boston, ntree = 500, mtry = floor(p/2))
plot(rffit2, add = TRUE, col = 'red')
rffit3 <- randomForest(medv ~ ., data = Boston, ntree = 500, mtry = floor(sqrt(p)))
plot(rffit3, add = TRUE, col = 'blue')
legend('topright', legend = c('m = p', 'm = p/2', 'm = sqrt(p)'), col = c('black', 'red', 'blue'), lwd = 2)
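# Sketch: tuneRF() in the randomForest package automates this mtry search using
# the OOB error; the ntreeTry/stepFactor/improve values here are illustrative
set.seed(123)
tuneRF(x = Boston[, setdiff(names(Boston), 'medv')], y = Boston$medv,
       ntreeTry = 500, stepFactor = 2, improve = 0.01)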
plot(importance(rffit3), xlab = 'variables', ylab = 'importance',
xaxt = 'n', type = 'h', lwd = 3)
axis(1, at = 1:length(importance(rffit3)), labels = rownames(importance(rffit3)))
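# Sketch: refitting with importance = TRUE also records the permutation-based
# %IncMSE measure, and varImpPlot() shows both importance measures side by side
rffit_imp <- randomForest(medv ~ ., data = Boston, ntree = 500,
                          mtry = floor(sqrt(p)), importance = TRUE)
varImpPlot(rffit_imp)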
set.seed(123)
n <- 500
lambda <- 0.5
X <- runif(n, 0, 10)
Y <- rnorm(n, sin(2*X), 0.75)
xgrid <- seq(min(X), max(X), length.out = 1000)
shallow_tree <- rpart(Y ~ X, method = "anova", control = list(maxdepth = 4))
pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')
res <- Y - lambda * pred_y_train
shallow_tree <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))
pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))
plot(X, res, pch = 16, xlab = 'X', ylab = 'residual')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')
res <- res - lambda * pred_y_train
shallow_tree <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))
pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))
plot(X, res, pch = 16, xlab = 'X', ylab = 'residual')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')
res <- res - lambda * pred_y_train
shallow_tree <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))
pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))
plot(X, res, pch = 16, xlab = 'X', ylab = 'residual')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')
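# Sketch: the same boosting updates written as a loop, accumulating the ensemble
# fit over B_boost rounds (B_boost is a choice made here for illustration)
B_boost <- 100
fhat <- rep(0, length(xgrid))
res <- Y
for (b in 1:B_boost){
  tree_b <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))
  fhat <- fhat + lambda * predict(tree_b, newdata = list(X = xgrid))
  res <- res - lambda * predict(tree_b, newdata = list(X = X))
}
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, fhat, lwd = 2, col = 'red')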
library(xgboost)
Boston_X <- Boston
Boston_X$medv <- NULL
Boston_xgb <- xgb.DMatrix(as.matrix(Boston_X), label = Boston$medv)
fit1_xgb <- xgb.cv(
  data = Boston_xgb,
  nrounds = 5000,
  nfold = 5,
  objective = "reg:squarederror", # for regression models
  verbose = 0,                    # silent
  early_stopping_rounds = 10      # stop if no improvement for 10 consecutive trees
)
ggplot(fit1_xgb$evaluation_log) +
  geom_line(aes(iter, train_rmse_mean), color = "red") +
  geom_line(aes(iter, test_rmse_mean), color = "blue")
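# A quick check of where the CV error bottoms out for the default learning rate
# (same idea as the grid search further below)
which.min(fit1_xgb$evaluation_log$test_rmse_mean)
min(fit1_xgb$evaluation_log$test_rmse_mean)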
fit2_xgb <- xgb.cv(
  params = list(eta = 0.01),
  data = Boston_xgb,
  nrounds = 5000,
  nfold = 5,
  objective = "reg:squarederror", # for regression models
  verbose = 0,                    # silent
  early_stopping_rounds = 10      # stop if no improvement for 10 consecutive trees
)
ggplot(fit2_xgb$evaluation_log) +
  geom_line(aes(iter, train_rmse_mean), color = "red") +
  geom_line(aes(iter, test_rmse_mean), color = "blue")
Boston_xgb <- xgb.DMatrix(as.matrix(Boston_X), label = Boston$medv)
# create hyperparameter grid
hyper_grid <- expand.grid(
  eta = c(.01, .05, .15, 0.3),
  max_depth = c(1, 3, 5, 7),
  optimal_trees = 0, # a place to dump results
  min_RMSE = 0       # a place to dump results
)
# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  # reproducibility
  set.seed(123)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = Boston_xgb,
    nrounds = 5000,
    nfold = 5,
    objective = "reg:squarederror", # for regression models
    verbose = 0,                    # silent
    early_stopping_rounds = 10      # stop if no improvement for 10 consecutive trees
  )
  # record the best CV round and its test RMSE in the grid
  hyper_grid$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  hyper_grid$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
}
hyper_grid[order(hyper_grid$min_RMSE, decreasing = F),]
eta max_depth optimal_trees min_RMSE
12 0.30 5 36 3.726241
6 0.05 3 208 3.777685
8 0.30 3 35 3.794773
5 0.01 3 736 3.802070
7 0.15 3 61 3.810126
10 0.05 5 204 3.829675
15 0.15 7 78 3.862303
11 0.15 5 57 3.874303
16 0.30 7 17 3.875736
13 0.01 7 755 3.916442
9 0.01 5 613 3.917884
14 0.05 7 125 3.984433
3 0.15 1 97 4.077576
2 0.05 1 305 4.083208
4 0.30 1 53 4.100558
1 0.01 1 1012 4.142000
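# Sketch: the same grid-search loop extends to other xgboost parameters, e.g.
# row and column subsampling; the values below are illustrative, not tuned, and
# the params list inside the loop would need the two extra entries as well
hyper_grid2 <- expand.grid(
  eta = c(.05, .3),
  max_depth = c(3, 5),
  subsample = c(.65, 1),
  colsample_bytree = c(.8, 1),
  optimal_trees = 0,
  min_RMSE = 0
)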
Boston_xgb <- xgb.DMatrix(as.matrix(Boston_X), label = Boston$medv)
# parameter list
params <- list(
  eta = hyper_grid$eta[which.min(hyper_grid$min_RMSE)],
  max_depth = hyper_grid$max_depth[which.min(hyper_grid$min_RMSE)]
)
# train final model
xgb.fit.final <- xgboost(
  params = params,
  data = Boston_xgb,
  nrounds = hyper_grid$optimal_trees[which.min(hyper_grid$min_RMSE)],
  objective = "reg:squarederror",
  verbose = 0
)
# create importance matrix
importance_matrix <- xgb.importance(model = xgb.fit.final)
# variable importance plot
xgb.plot.importance(importance_matrix, top_n = 10, measure = "Gain")
Boston_ts_X <- Boston_ts
Boston_ts_X$medv <- NULL
Boston_test_xgb <- xgb.DMatrix(as.matrix(Boston_ts_X), label = Boston_ts$medv)
pred_y <- predict(xgb.fit.final, Boston_test_xgb)
plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')
err_boost <- (pred_y - Boston_ts$medv)^2
sqrt(mean(err_1D)); sqrt(mean(err_2D)); sqrt(mean(err_all)); sqrt(mean(err_RF)); sqrt(mean(err_boost))
[1] 6.022948
[1] 5.147602
[1] 4.743209
[1] 3.482059
[1] 2.727177
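# Sketch: the same test-set RMSEs collected in one bar plot for easier comparison
rmse_all <- c(tree_1D = sqrt(mean(err_1D)), tree_2D = sqrt(mean(err_2D)),
              tree_all = sqrt(mean(err_all)), RF = sqrt(mean(err_RF)),
              boosting = sqrt(mean(err_boost)))
par(mar = c(4, 4, 2, 2), family = 'serif')
barplot(rmse_all, ylab = 'test RMSE', col = cols[1:5])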